import os
from PyCmpltrtok.common import sep, get_dir_name_ext

# Stage marker for the nltk import (sep comes from PyCmpltrtok.common;
# presumably it prints a section banner -- verify against that package).
sep('nltk')
import nltk

# Per-user nltk data directory: $HOME/nltk_data, or ./nltk_data when HOME is
# not set in the environment.
_home_dir = os.environ.get("HOME", ".")
my_nltk_data_path = os.path.join(_home_dir, 'nltk_data')

# Rebind the search path with the per-user directory in front of the existing
# entries.
nltk.data.path = [my_nltk_data_path, *nltk.data.path]

# Disabled debugging aid: uncomment to clear nltk's resource cache.
# sep('Clear cache')
# nltk.data.clear_cache()
# sep('Clear cache done')
sep('nltk import and set path over')

# Stage marker for the unstructured import (sep comes from PyCmpltrtok.common;
# presumably it prints a section banner -- verify against that package).
sep('unstructured')
# NOTE(review): this import sits below the nltk data-path setup earlier in the
# file, presumably so that unstructured sees the customised path -- confirm
# before moving it to the top-of-file import group.
from unstructured.partition.pdf import partition_pdf
# Alternative entry point, currently disabled:
# from unstructured.partition.auto import partition
sep('unstructured import over')

# Resolve the input PDF and the output text file; both live in the same
# directory as this script.
xdir = os.path.dirname(os.path.abspath(__file__))
path = os.path.join(xdir, 'PAPER Attention is all you need, arxiv 1706.03762v7.tmp.pdf')
print(f'path=|{path}|')
# Only the middle item of the helper's 3-tuple is used as the output base name
# (presumably the file name without its extension -- verify against
# PyCmpltrtok.common.get_dir_name_ext).
_, xbase, _ = get_dir_name_ext(path)
dest = os.path.join(xdir, f'{xbase}.txt')
print(f'dest=|{dest}|')

sep('parse')
elements = partition_pdf(filename=path)
# Disabled alternative (needs the commented-out `partition` import):
# elements = partition(filename=path)

sep('write')
# String form of every parsed element, separated by one blank line.
text = "\n\n".join(str(el) for el in elements)
with open(dest, 'w', encoding='utf8') as f:
    # `text` is one str, so write() is the right call; writelines() would
    # treat it as an iterable and emit it one character at a time.
    f.write(text)

sep('over')
